--- title: esol keywords: fastai sidebar: home_sidebar summary: "Using `molmapnets` for regression, tested on the `eSOL` dataset." description: "Using `molmapnets` for regression, tested on the `eSOL` dataset." nb_path: "03_train.ipynb" ---
%config Completer.use_jedi = False
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
sns.set_theme(palette='Set2')
colors = sns.color_palette()
colors
from chembench import dataset
from molmap import MolMap
from molmapnets.models import MolMapRegression
data = dataset.load_ESOL()
We have the SMILES (Simplified Molecular Input Line Entry System) strings for different compounds and their corresponding solubility measure:
# Peek at the first rows of the dataset: SMILES strings and measured solubility.
data.df.head()
Using MolMap we can extract features using the SMILES as input. We can specify the feature type ftype, the feature pairwise distance calculation method metric, and the feature grid arrangement method fmap_type:
# Show MolMap's signature and docstring (IPython `?` help syntax — notebook only).
MolMap?
# Two feature extractors: molecular descriptors and fingerprints, both using
# cosine distance between features for the grid arrangement.
descriptor = MolMap(ftype='descriptor', metric='cosine',)
fingerprint = MolMap(ftype='fingerprint', metric='cosine')
After setting up the feature extracting method, we can then use the .fit method of the feature object to extract the features. During this step we need to specify the algorithm (method) to embed higher dimensional features into a 2D representation:
# Embed the high-dimensional feature space onto a 2D grid with UMAP.
# min_dist / n_neighbors are the usual UMAP embedding knobs.
descriptor.fit(verbose=0, method='umap', min_dist=0.1, n_neighbors=15,)
fingerprint.fit(verbose=0, method='umap', min_dist=0.1, n_neighbors=10,)
And we can then visualise the feature maps
# Visualise where each feature lands on the 2D grid...
descriptor.plot_grid()
fingerprint.plot_grid()
# ...and the underlying 2D embedding as scatter plots.
descriptor.plot_scatter()
fingerprint.plot_scatter()
# Transform every molecule's SMILES into a descriptor feature map.
X = descriptor.batch_transform(data.x)
X.shape
In PyTorch the training data for computer vision problems takes the shape (n_channels, height, width), while the features extracted from MolMap take the shape (height, width, n_channels), so we'll first correct it by moving the channels dimension before the feature map dimensions.
# Show the channels-first shape PyTorch expects.
# NOTE(review): the result is not assigned — X itself keeps its
# channels-last layout here; presumably the Dataset/model handles the
# layout conversion — confirm.
torch.movedim(torch.from_numpy(X), -1, 1).shape
# Regression targets (solubility values).
Y = data.y
Y.shape
Now from these feature maps we can create the dataset suitable for training models in PyTorch
# Wrap the descriptor feature maps and targets in a Dataset.
esol = SingleFeatureData(data.y, X)
# Reproducible 904/112/112 train/validation/test split (seeded generator).
train, val, test = random_split(esol, [904,112,112], generator=torch.Generator().manual_seed(7))
len(train), len(val), len(test)
# Mini-batch loaders, batch size 8.
# NOTE(review): shuffle=True on the val/test loaders is harmless for loss
# computation but unnecessary — consider shuffle=False there.
train_loader = DataLoader(train, batch_size=8, shuffle=True)
val_loader = DataLoader(val, batch_size=8, shuffle=True)
test_loader = DataLoader(test, batch_size=8, shuffle=True)
And we can get one batch of data by making the data loader iterable
# Grab one batch from the training loader to inspect targets and input shape.
x, t = next(iter(train_loader))
t
x.shape
Finally with the data prepared we can train the models. These are tests to show that the models work as expected, but we can certainly fine tune the model to achieve better results.
# Regression model with default settings (presumably matching the descriptor
# map's channel count — confirm against MolMapRegression's defaults).
model = MolMapRegression()
# Short demonstration run — not tuned.
epochs = 5
# Use the first GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
# Adam optimiser; mean-squared-error loss for regression.
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()
And the training loop
# Train the descriptor-based model: standard PyTorch loop
# (zero grads -> forward -> loss -> backward -> step).
# Fix: the notebook export flattened all indentation, which is invalid
# Python — restored here. Also put the model explicitly in training mode.
model.train()
for epoch in range(epochs):
    running_loss = 0.0
    for i, (xb, yb) in enumerate(train_loader):
        xb, yb = xb.to(device), yb.to(device)
        # zero gradients
        optimizer.zero_grad()
        # forward propagation
        pred = model(xb)
        # loss calculation
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()
        # print statistics: running epoch-average training loss every 50 batches
        running_loss += loss.item()
        if (i+1) % 50 == 0:
            print('[Epoch: %d, Iter: %5d] Training loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / (i+1)))
print('Training finished')
Loss on validation data set
# Evaluate on the validation split without tracking gradients.
# Fix: restored the flattened indentation, and switch the model to eval
# mode so BatchNorm/Dropout layers behave deterministically at inference.
model.eval()
running_loss = 0.0
with torch.no_grad():
    for i, (xb, yb) in enumerate(val_loader):
        xb, yb = xb.to(device), yb.to(device)
        # forward propagation
        pred = model(xb)
        # loss calculation
        loss = criterion(pred, yb)
        running_loss += loss.item()
        # report cumulative mean validation loss every 3 batches
        if (i+1) % 3 == 0:
            print('[Iter: %5d] Validation loss: %.3f' %
                  (i + 1, running_loss / (i+1)))
# Transform every molecule's SMILES into a fingerprint feature map.
X_fingerprint = fingerprint.batch_transform(data.x)
X_fingerprint.shape
Now from these feature maps we can create the dataset suitable for training models in PyTorch
# Same Dataset/split/loader recipe, now for the fingerprint feature maps.
esol_fingerprint = SingleFeatureData(data.y, X_fingerprint)
train_fingerprint, val_fingerprint, test_fingerprint = random_split(esol_fingerprint, [904,112,112], generator=torch.Generator().manual_seed(7))
# NOTE(review): this prints the lengths of the *descriptor* split
# (train/val/test), not the fingerprint one — looks like a copy-paste slip.
len(train), len(val), len(test)
train_loader_fingerprint = DataLoader(train_fingerprint, batch_size=8, shuffle=True)
val_loader_fingerprint = DataLoader(val_fingerprint, batch_size=8, shuffle=True)
test_loader_fingerprint = DataLoader(test_fingerprint, batch_size=8, shuffle=True)
And we can get one batch of data by making the data loader iterable
# Grab one batch to sanity-check target and input shapes.
x, t = next(iter(train_loader_fingerprint))
t.shape
x.shape
And regression. Different feature maps have different number of channels.
# Fingerprint feature maps have 12 channels, so the first conv layer's
# input-channel count is overridden accordingly.
model_fingerprint = MolMapRegression(conv_in1=12)
epochs = 5
# Use the first GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_fingerprint.to(device)
# Fresh optimiser bound to this model's parameters; MSE loss for regression.
optimizer = optim.Adam(model_fingerprint.parameters(), lr=0.001)
criterion = nn.MSELoss()
And the training loop
# Train the fingerprint-based model (same loop shape as the descriptor model).
# Fix: restored the indentation flattened by the notebook export, and put
# the model explicitly in training mode.
model_fingerprint.train()
for epoch in range(epochs):
    running_loss = 0.0
    for i, (xb, yb) in enumerate(train_loader_fingerprint):
        xb, yb = xb.to(device), yb.to(device)
        # zero gradients
        optimizer.zero_grad()
        # forward propagation
        pred = model_fingerprint(xb)
        # loss calculation
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()
        # print statistics: running epoch-average training loss every 50 batches
        running_loss += loss.item()
        if (i+1) % 50 == 0:
            print('[Epoch: %d, Iter: %5d] Training loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / (i+1)))
print('Training finished')
Loss on validation data set
# Validation loss for the fingerprint model.
# Fix: restored flattened indentation; switch to eval mode so
# BatchNorm/Dropout layers behave deterministically at inference.
model_fingerprint.eval()
running_loss = 0.0
with torch.no_grad():
    for i, (xb, yb) in enumerate(val_loader_fingerprint):
        xb, yb = xb.to(device), yb.to(device)
        # forward propagation
        pred = model_fingerprint(xb)
        # loss calculation
        loss = criterion(pred, yb)
        running_loss += loss.item()
        # report cumulative mean validation loss every 3 batches
        if (i+1) % 3 == 0:
            print('[Iter: %5d] Validation loss: %.3f' %
                  (i + 1, running_loss / (i+1)))
Now we can feed both the feature maps to the model as a tuple
# Dataset yielding (descriptor_map, fingerprint_map) pairs with one target.
double_feature = DoubleFeatureData(data.y, (X, X_fingerprint))
# Same reproducible 904/112/112 split as before.
train_double, val_double, test_double = random_split(double_feature, [904,112,112], generator=torch.Generator().manual_seed(7))
len(train_double), len(val_double), len(test_double)
train_loader_double = DataLoader(train_double, batch_size=8, shuffle=True)
val_loader_double = DataLoader(val_double, batch_size=8, shuffle=True)
test_loader_double = DataLoader(test_double, batch_size=8, shuffle=True)
And we can get one batch of data by making the data loader iterable
# Grab one batch: the input is now a pair of tensors, one per feature type.
x, t = next(iter(train_loader_double))
t.shape
x1, x2 = x
x1.shape, x2.shape
And regression. Different feature maps have different number of channels.
# Two-input model: 13 channels for the descriptor maps, 12 for fingerprints.
model_double = MolMapRegression(conv_in1=13, conv_in2=12)
epochs = 5
# Use the first GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_double.to(device)
# Fresh optimiser bound to this model's parameters; MSE loss for regression.
optimizer = optim.Adam(model_double.parameters(), lr=0.001)
criterion = nn.MSELoss()
And the training loop
# Train the double-feature model; each batch unpacks into the two feature
# maps plus the target. Fix: restored the indentation flattened by the
# notebook export, and put the model explicitly in training mode.
model_double.train()
for epoch in range(epochs):
    running_loss = 0.0
    for i, ((x1, x2), yb) in enumerate(train_loader_double):
        x1, x2, yb = x1.to(device), x2.to(device), yb.to(device)
        # zero gradients
        optimizer.zero_grad()
        # forward propagation
        pred = model_double((x1, x2))
        # loss calculation
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()
        # print statistics: running epoch-average training loss every 50 batches
        running_loss += loss.item()
        if (i+1) % 50 == 0:
            print('[Epoch: %d, Iter: %5d] Training loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / (i+1)))
print('Training finished')
Loss on validation data set
# Validation loss for the double-feature model.
# Fix: restored flattened indentation; switch to eval mode so
# BatchNorm/Dropout layers behave deterministically at inference.
model_double.eval()
running_loss = 0.0
with torch.no_grad():
    for i, ((x1, x2), yb) in enumerate(val_loader_double):
        x1, x2, yb = x1.to(device), x2.to(device), yb.to(device)
        # forward propagation
        pred = model_double((x1, x2))
        # loss calculation
        loss = criterion(pred, yb)
        running_loss += loss.item()
        # report cumulative mean validation loss every 3 batches
        if (i+1) % 3 == 0:
            print('[Iter: %5d] Validation loss: %.3f' %
                  (i + 1, running_loss / (i+1)))
print('Validation finished')